<?php
/**
 * Created by PhpStorm.
 * User: zhangjian37
 * Date: 2017/2/20
 * Time: 10:50
 */

// Base URL of the site to crawl.
$host = 'http://bj.58.com/';
// Listing category path, appended directly to the base URL.
$topic = 'ershoufang/';
// Path prefix that precedes the page number in paginated listing URLs.
$page_no = 'pn';
// Query string appended to every listing request.
$params = 'utm_source=market';

/**
 * Collects the links found on the listing pages.
 *
 * Thin wrapper around pageAllLink(): it always starts at page 1, asks for a
 * page length of 2 and begins with an empty link list.
 *
 * @param string   $host    Base URL, passed through to pageAllLink().
 * @param string   $topic   Category path, passed through to pageAllLink().
 * @param string   $page_no Page-number prefix, passed through to pageAllLink().
 * @param string   $params  Query string, passed through to pageAllLink().
 * @param resource $mh      cURL multi handle used for the transfers.
 * @return array Whatever pageAllLink() returns for those arguments.
 */
function connectUrl($host,$topic,$page_no,$params,$mh){
    $first_page = 1;
    $page_count = 2;

    return pageAllLink($host, $topic, $page_no, $params, $first_page, $page_count, $mh, array());
}

/**
 * Extracts listing links from HTML content.
 *
 * A link is captured when an anchor tag has an href starting with
 * http://bj.58.com/ and is followed, on the same line, by class="t" and a
 * closing ">". Only the href value is returned.
 *
 * @param string $web_content HTML to scan; null/false is treated as "".
 * @return array List of captured URLs. Always an array: empty when nothing
 *               matches or when the pattern could not be applied.
 */
function _filterUrl($web_content) {
    // The tag-name class is [aA]; a "|" inside a character class is a literal
    // pipe, not an alternation, so it must not appear there.
    $reg_tag_a = '/<[aA].*?href=[\'\"]{0,1}(http:\/\/bj\.58\.com\/[^>\'\"\ ]*).*?class="t".*?>/';

    // preg_match_all() yields 0 for "no match" and false on error; both mean
    // there is nothing to return, so callers can always iterate the result.
    $matched = preg_match_all($reg_tag_a, (string) $web_content, $match_result);
    if (!$matched) {
        return array();
    }

    return $match_result[1];
}

/**
 * Extracts phone numbers from HTML content.
 *
 * Captures every run of 11 digits that starts with "1" and is followed by
 * optional whitespace and a closing </span> tag.
 *
 * @param string $web_content HTML to scan; null/false is treated as "".
 * @return array List of captured numbers as strings. Always an array: empty
 *               when nothing matches or when the pattern could not be applied.
 */
function _filterPhone($web_content){
    $reg_phone = '/(1[0-9]{10})[\s]*?<\/span>/';

    // preg_match_all() yields 0 for "no match" and false on error; returning
    // an empty list in both cases keeps callers' foreach loops safe.
    $matched = preg_match_all($reg_phone, (string) $web_content, $match_result);
    if (!$matched) {
        return array();
    }

    return $match_result[1];
}

/**
 * Fetches a range of listing pages and collects the links found on them.
 *
 * Pages are requested in batches of at most 5 concurrent transfers. Each batch
 * is queued through pageLink(), run to completion on the multi handle, and
 * every finished response is scanned with _filterUrl().
 *
 * Exactly $page_length pages are requested: the final batch is shortened to
 * the number of pages still outstanding instead of always being a full batch.
 *
 * @param string   $host          Base URL, passed through to pageLink().
 * @param string   $topic         Category path, passed through to pageLink().
 * @param string   $page_no       Page-number prefix, passed through to pageLink().
 * @param string   $params        Query string, passed through to pageLink().
 * @param int      $start_page_no First page number to request.
 * @param int      $page_length   Number of consecutive pages to request.
 * @param resource $mh            cURL multi handle the transfers run on.
 * @param array    $links         Links collected so far; new ones are appended.
 * @return array $links with the links found on the requested pages appended.
 */
function pageAllLink($host,$topic,$page_no,$params,$start_page_no,$page_length,$mh,$links){
    // Maximum number of pages requested concurrently.
    $maxLengthPreTime = 5;

    for ($offset = 0; $offset < $page_length; $offset += $maxLengthPreTime) {
        // Never queue more pages than the caller asked for.
        $batch_length = min($maxLengthPreTime, $page_length - $offset);
        pageLink($host, $topic, $page_no, $params, $start_page_no + $offset, $batch_length, $mh);

        do {
            // Drive the transfers currently attached to the multi handle.
            while (($cme = curl_multi_exec($mh, $active)) == CURLM_CALL_MULTI_PERFORM);

            if ($cme != CURLM_OK) {
                break;
            }

            // Harvest every transfer that has finished since the last pass.
            while ($done = curl_multi_info_read($mh)) {
                $page_links = _filterUrl(curl_multi_getcontent($done['handle']));
                // Guard the loop: a page without matches must not be iterated
                // as a non-array value.
                if (is_array($page_links)) {
                    foreach ($page_links as $link) {
                        $links[] = $link;
                    }
                }

                curl_multi_remove_handle($mh, $done['handle']);
            }

            // Wait for socket activity; when select itself fails (-1), pause
            // briefly so this loop does not spin at full CPU.
            if ($active && curl_multi_select($mh, 10) === -1) {
                usleep(100000);
            }
        } while ($active);
    }

    return $links;
}

/**
 * Queues one cURL transfer per listing page on the given multi handle.
 *
 * Page 1 is requested at "<host><topic>?<params>"; every other page at
 * "<host><topic><page_no><n>/?<params>". The transfers are only added to the
 * multi handle here, they are not executed.
 *
 * @param string   $host          Base URL.
 * @param string   $topic         Category path appended to the base URL.
 * @param string   $page_no       Prefix placed before the page number.
 * @param string   $params        Query string appended after "?".
 * @param int      $start_page_no First page number to queue.
 * @param int      $page_length   Number of consecutive pages to queue.
 * @param resource $mh            cURL multi handle receiving the transfers.
 */
function pageLink($host,$topic,$page_no,$params,$start_page_no,$page_length,$mh){
    $user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36';
    $end_page_no = $start_page_no + $page_length;

    for ($page = $start_page_no; $page < $end_page_no; $page++) {
        // The first page has no page segment in its path.
        $page_segment = ($page == 1) ? '' : $page_no.$page.'/';
        $url = $host.$topic.$page_segment.'?'.$params;

        // One easy handle per page, configured in a single call.
        $handle = curl_init();
        curl_setopt_array($handle, array(
            CURLOPT_HEADER         => 0,
            CURLOPT_URL            => $url,
            CURLOPT_USERAGENT      => $user_agent,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => 1,
        ));

        // Attach the handle to the batch session.
        curl_multi_add_handle($mh, $handle);
    }
}



/**
 * Crawls the listing pages, visits every link found and stores the phone
 * numbers extracted from those pages.
 *
 * Links come from connectUrl(). Each link is echoed followed by "<br>" and
 * queued on a shared multi handle; after every 5 queued links the batch is
 * processed with getUserInfo() and the script sleeps for 2 seconds. The
 * collected phone numbers are written, one per line, to the file "58userlist"
 * (relative path), replacing any previous content.
 *
 * @param string $host    Base URL, passed through to connectUrl().
 * @param string $topic   Category path, passed through to connectUrl().
 * @param string $page_no Page-number prefix, passed through to connectUrl().
 * @param string $params  Query string, passed through to connectUrl().
 * @return void
 */
function getUserInfos($host,$topic,$page_no,$params){
    // Number of detail pages fetched concurrently.
    $batch_size = 5;
    $user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36';

    $mh = curl_multi_init();

    // Phone numbers are stored as array keys so duplicates collapse.
    $userInfos = array();
    $links = connectUrl($host,$topic,$page_no,$params,$mh);
    if (!is_array($links)) {
        $links = array();
    }

    $queued = 0;
    foreach ($links as $link) {
        echo $link."<br>";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_URL, $link);
        curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_multi_add_handle($mh, $ch);
        $queued++;

        // Flush on the count of queued handles, not on the zero-based index,
        // so every batch (including the first) holds exactly $batch_size.
        if ($queued % $batch_size == 0) {
            $userInfos = getUserInfo($mh,$userInfos);
            sleep(2);
        }
    }
    // Process whatever is left over from the last, possibly partial, batch.
    $userInfos = getUserInfo($mh,$userInfos);

    curl_multi_close($mh);

    $content = '';
    foreach ($userInfos as $phone => $unused) {
        $content .= $phone."\n";
    }

    // Report a failed write explicitly instead of writing to an invalid
    // file handle.
    if (file_put_contents("58userlist", $content) === false) {
        trigger_error('Unable to write phone list to 58userlist', E_USER_WARNING);
    }
}

/**
 * Runs every transfer queued on the multi handle and collects phone numbers
 * from the responses.
 *
 * Each finished response is scanned with _filterPhone(); every number found
 * becomes a key of $userInfos (value 1), so duplicates collapse. Finished
 * handles are removed from the multi handle. Processing stops early when
 * curl_multi_exec() reports anything other than CURLM_OK.
 *
 * @param resource $mh        cURL multi handle with the queued transfers.
 * @param array    $userInfos Numbers collected so far, keyed by phone number.
 * @return array $userInfos with the newly found numbers added.
 */
function getUserInfo($mh,$userInfos){
    do {
        // Drive the transfers currently attached to the multi handle.
        while (($cme = curl_multi_exec($mh, $active)) == CURLM_CALL_MULTI_PERFORM);

        if ($cme != CURLM_OK) {
            break;
        }

        // Harvest every transfer that has finished since the last pass.
        while ($done = curl_multi_info_read($mh)) {
            $phones = _filterPhone(curl_multi_getcontent($done['handle']));
            // A page without phone numbers must not be iterated as a
            // non-array value.
            if (is_array($phones)) {
                foreach ($phones as $phone) {
                    $userInfos[$phone] = 1;
                }
            }

            // Always detach the finished handle, whether or not it yielded
            // any numbers.
            curl_multi_remove_handle($mh, $done['handle']);
        }

        // Wait for socket activity; when select itself fails (-1), pause
        // briefly so this loop does not spin at full CPU.
        if ($active && curl_multi_select($mh, 20) === -1) {
            usleep(100000);
        }
    } while ($active);

    return $userInfos;
}

// Disabled debug snippet: fetch the listing links only and dump them.
//$mh = curl_multi_init();
//$links = connectUrl($host,$topic,$page_no,$params,$mh);
//var_dump($links);
// Sample URL kept for reference:
//http://bj.58.com/ershoufang/29119439444539x.shtml?utm_source=market?psid=113651114194978299431891319&entinfo=29119439444539_0
// Entry point: run the crawl with the settings defined at the top of this file.
getUserInfos($host,$topic,$page_no,$params);